import scanpy as sc #software suite of tools for single-cell analysis in python
import besca as bc #internal BEDA package for single cell analysis
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy
import anndata as ad
from scipy.sparse import csr_matrix
import scanpy.external as sce
from harmony import harmonize
import umap.umap_ as umap
from matplotlib.pyplot import rc_context
import matplotlib_inline.backend_inline
import warnings
print(ad.__version__)
warnings.filterwarnings("ignore")
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmp5kyf02lx INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmp5kyf02lx/_remote_module_non_scriptable.py INFO:lightning_fabric.utilities.seed:Global seed set to 0
0.9.1
Displaying result settings required for single cell analysis
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')
scanpy==1.9.3 anndata==0.9.1 umap==0.3.10 numpy==1.23.5 scipy==1.10.1 pandas==2.0.1 scikit-learn==1.2.2 statsmodels==0.14.0 python-igraph==0.10.4 louvain==0.8.0 pynndescent==0.5.10
#Reading last saved annoatated data object written in h5ad data format.
#We used similar adata variable to make similar previous data analysis
# List of file paths
file_paths = [
'/home/jana/pbmcsarc1.h5ad',
'/home/jana/pbmcsarc2.h5ad',
'/home/jana/pbmcsarc3.h5ad',
'/home/jana/pbmchealth1.h5ad',
'/home/jana/pbmchealth2.h5ad',
'/home/jana/pbmchealth3.h5ad',
'/home/jana/pbmchealth4.h5ad'
]
# List to store loaded data objects
data_objects = []
# Loop to read h5ad files and store data objects
for file_path in file_paths:
data_objects.append(sc.read_h5ad(file_path))
# Unpack data objects to individual variables
pbmcsarc1, pbmcsarc2, pbmcsarc3, pbmchealthy1, pbmchealthy2, pbmchealthy3, pbmchealthy4 = data_objects
Cell cycle Genes Finding
# Read cell cycle genes from file
with open('./data/regev_lab_cell_cycle_genes.txt') as file:
cell_cycle_genes = [x.strip() for x in file]
#print(len(cell_cycle_genes))
print("In the regev-lab dataset cell cycle genes lenghth:", len(cell_cycle_genes))
# Split into S phase and G2/M phase gene lists
s_genes = cell_cycle_genes[:43]
g2m_genes = cell_cycle_genes[43:]
# Filter cell cycle genes based on availability in different datasets
cell_cycle_genes = [x for x in cell_cycle_genes if x in pbmcsarc1.var_names]
print("Sample pbmcsarc1 cell cycle genes lenghth:", len(cell_cycle_genes))
cell_cycle_genes = [x for x in cell_cycle_genes if x in pbmcsarc2.var_names]
print("Sample pbmcsarc2 cell cycle genes lenghth:", len(cell_cycle_genes))
cell_cycle_genes = [x for x in cell_cycle_genes if x in pbmcsarc3.var_names]
print("Sample pbmcsarc3 cell cycle genes lenghth:", len(cell_cycle_genes))
cell_cycle_genes = [x for x in cell_cycle_genes if x in pbmchealthy1.var_names]
print("Sample pbmchealthy1 cell cycle genes lenghth:", len(cell_cycle_genes))
cell_cycle_genes = [x for x in cell_cycle_genes if x in pbmchealthy2.var_names]
print("Sample pbmchealthy2 cell cycle genes lenghth:", len(cell_cycle_genes))
cell_cycle_genes = [x for x in cell_cycle_genes if x in pbmchealthy3.var_names]
print("Sample pbmchealthy3 cell cycle genes lenghth:", len(cell_cycle_genes))
cell_cycle_genes = [x for x in cell_cycle_genes if x in pbmchealthy4.var_names]
print("Sample pbmchealthy4 cell cycle genes lenghth:", len(cell_cycle_genes))
In the regev-lab dataset cell cycle genes lenghth: 97 Sample pbmcsarc1 cell cycle genes lenghth: 93 Sample pbmcsarc2 cell cycle genes lenghth: 93 Sample pbmcsarc3 cell cycle genes lenghth: 93 Sample pbmchealthy1 cell cycle genes lenghth: 93 Sample pbmchealthy2 cell cycle genes lenghth: 93 Sample pbmchealthy3 cell cycle genes lenghth: 91 Sample pbmchealthy4 cell cycle genes lenghth: 91
Caculating score_genes_cell_cycle: s_genes and g2m_genes
adata_list = [pbmcsarc1, pbmcsarc2, pbmcsarc3, pbmchealthy1, pbmchealthy2, pbmchealthy3, pbmchealthy4]
for adata in adata_list:
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
finished: added
'S_score', score of gene set (adata.obs).
815 total control genes are used. (0:00:00)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1', 'CDC25C']
finished: added
'G2M_score', score of gene set (adata.obs).
770 total control genes are used. (0:00:00)
--> 'phase', cell cycle phase (adata.obs)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
finished: added
'S_score', score of gene set (adata.obs).
727 total control genes are used. (0:00:00)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1']
finished: added
'G2M_score', score of gene set (adata.obs).
774 total control genes are used. (0:00:00)
--> 'phase', cell cycle phase (adata.obs)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
finished: added
'S_score', score of gene set (adata.obs).
816 total control genes are used. (0:00:00)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1', 'CDC25C']
finished: added
'G2M_score', score of gene set (adata.obs).
857 total control genes are used. (0:00:00)
--> 'phase', cell cycle phase (adata.obs)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
finished: added
'S_score', score of gene set (adata.obs).
731 total control genes are used. (0:00:00)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1']
finished: added
'G2M_score', score of gene set (adata.obs).
943 total control genes are used. (0:00:00)
--> 'phase', cell cycle phase (adata.obs)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
finished: added
'S_score', score of gene set (adata.obs).
687 total control genes are used. (0:00:00)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1']
finished: added
'G2M_score', score of gene set (adata.obs).
858 total control genes are used. (0:00:00)
--> 'phase', cell cycle phase (adata.obs)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP', 'E2F8']
finished: added
'S_score', score of gene set (adata.obs).
728 total control genes are used. (0:00:00)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1', 'NEK2']
finished: added
'G2M_score', score of gene set (adata.obs).
812 total control genes are used. (0:00:00)
--> 'phase', cell cycle phase (adata.obs)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
finished: added
'S_score', score of gene set (adata.obs).
599 total control genes are used. (0:00:01)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1']
finished: added
'G2M_score', score of gene set (adata.obs).
816 total control genes are used. (0:00:01)
--> 'phase', cell cycle phase (adata.obs)
Violin plot of 'S_score' and 'G2M_score' of all samples
adata_list = [pbmcsarc1, pbmcsarc2, pbmcsarc3, pbmchealthy1, pbmchealthy2, pbmchealthy3, pbmchealthy4]
for adata in adata_list:
sc.pl.violin(adata, keys=['S_score', 'G2M_score'], groupby = 'sample', rotation=90)
... storing 'phase' as categorical
... storing 'phase' as categorical
... storing 'phase' as categorical
... storing 'phase' as categorical
... storing 'phase' as categorical
... storing 'phase' as categorical
... storing 'phase' as categorical
Display all samples annotated data
adata_list = [pbmcsarc1, pbmcsarc2, pbmcsarc3, pbmchealthy1, pbmchealthy2,pbmchealthy3, pbmchealthy4]
for adata in adata_list:
print(adata)
AnnData object with n_obs × n_vars = 6962 × 19671
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase'
var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'leiden_res0_20_colors', 'leiden_res0_40_colors', 'leiden_res0_60_colors', 'leiden_res0_80_colors', 'leiden_res1_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
AnnData object with n_obs × n_vars = 9779 × 20394
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase'
var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'leiden_res0_20_colors', 'leiden_res0_40_colors', 'leiden_res0_60_colors', 'leiden_res0_80_colors', 'leiden_res1_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
AnnData object with n_obs × n_vars = 8324 × 18909
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase'
var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'leiden_res0_20_colors', 'leiden_res0_40_colors', 'leiden_res0_60_colors', 'leiden_res0_80_colors', 'leiden_res1_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
AnnData object with n_obs × n_vars = 5921 × 24730
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase'
var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'leiden_res0_20_colors', 'leiden_res0_40_colors', 'leiden_res0_60_colors', 'leiden_res0_80_colors', 'leiden_res1_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
AnnData object with n_obs × n_vars = 4881 × 25757
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase'
var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'leiden_res0_20_colors', 'leiden_res0_40_colors', 'leiden_res0_60_colors', 'leiden_res0_80_colors', 'leiden_res1_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
AnnData object with n_obs × n_vars = 3733 × 22187
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase'
var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'leiden_res0_20_colors', 'leiden_res0_40_colors', 'leiden_res0_60_colors', 'leiden_res0_80_colors', 'leiden_res1_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
AnnData object with n_obs × n_vars = 11808 × 27363
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase'
var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'hvg', 'leiden', 'leiden_res0_20_colors', 'leiden_res0_40_colors', 'leiden_res0_60_colors', 'leiden_res0_80_colors', 'leiden_res1_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'connectivities', 'distances'
saving the all files after cell score computing
save_files = [
'/home/jana/pbmcsarc1.h5ad',
'/home/jana/pbmcsarc2.h5ad',
'/home/jana/pbmcsarc3.h5ad',
'/home/jana/pbmchealth1.h5ad',
'/home/jana/pbmchealth2.h5ad',
'/home/jana/pbmchealth3.h5ad',
'/home/jana/pbmchealth4.h5ad'
]
adata_list = [pbmcsarc1, pbmcsarc2, pbmcsarc3, pbmchealthy1, pbmchealthy2, pbmchealthy3, pbmchealthy4]
# Save each adata to the corresponding file
for adata, save_file in zip(adata_list, save_files):
adata.write_h5ad(save_file)
from matplotlib.pyplot import rc_context
import matplotlib_inline.backend_inline
import warnings
# Suppress UserWarning
with warnings.catch_warnings():
warnings.simplefilter("ignore")
# Suppress DeprecationWarning
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Set matplotlib formats using the updated method
matplotlib_inline.backend_inline.set_matplotlib_formats('retina')
sc.set_figure_params(dpi=100)
with rc_context({'figure.figsize': (4, 4)}):
sc.pl.umap(pbmcsarc1, color = 'leiden')
sc.pl.umap(pbmcsarc2, color = 'leiden')
sc.pl.umap(pbmcsarc3, color = 'leiden')
sc.pl.umap(pbmchealthy1, color = 'leiden')
sc.pl.umap(pbmchealthy2, color = 'leiden')
sc.pl.umap(pbmchealthy3, color = 'leiden')
sc.pl.umap(pbmchealthy4, color = 'leiden')
Overlapping Genes finding across all samples
var_names = pbmchealthy4.var_names.intersection(pbmcsarc1.var_names)
var_names = var_names.intersection(pbmcsarc2.var_names)
var_names = var_names.intersection(pbmcsarc3.var_names)
var_names = var_names.intersection(pbmchealthy1.var_names)
var_names = var_names.intersection(pbmchealthy2.var_names)
var_names = var_names.intersection(pbmchealthy3.var_names)
print("Overlapping Genes length across all samples:", len(var_names))
Overlapping Genes length across all samples: 17178
Overlapping Genes as var_names assigned to all samples
pbmcsarc2 = pbmcsarc2[:, var_names]
pbmcsarc1 = pbmcsarc1[:, var_names]
pbmcsarc3 = pbmcsarc3[:, var_names]
pbmchealthy1 = pbmchealthy1[:, var_names]
pbmchealthy2 = pbmchealthy2[:, var_names]
pbmchealthy3 = pbmchealthy3[:, var_names]
pbmchealthy4 = pbmchealthy4[:, var_names]
Ingestion: This function combines embeddings and annotations from an adata object with those from a reference dataset. Since our observation 'pbmchealthy4' exhibits the highest number of Leiden clusters, we selected it as the reference dataset.
%%time
import numba
import warnings
# Suppress NumbaPerformanceWarning
warnings.filterwarnings("ignore", category=numba.NumbaPerformanceWarning)
sc.tl.ingest(pbmcsarc1, pbmchealthy4, obs= 'leiden')
sc.tl.ingest(pbmcsarc2, pbmchealthy4, obs= 'leiden')
sc.tl.ingest(pbmcsarc3, pbmchealthy4, obs= 'leiden')
sc.tl.ingest(pbmchealthy1, pbmchealthy4, obs= 'leiden')
sc.tl.ingest(pbmchealthy2, pbmchealthy4, obs= 'leiden')
sc.tl.ingest(pbmchealthy3, pbmchealthy4, obs= 'leiden')
running ingest
finished (0:00:32)
running ingest
finished (0:00:24)
running ingest
finished (0:00:22)
running ingest
finished (0:00:17)
running ingest
finished (0:00:17)
running ingest
finished (0:00:14)
CPU times: user 2min 51s, sys: 43 s, total: 3min 34s
Wall time: 2min 21s
from matplotlib.pyplot import rc_context
sc.set_figure_params(dpi=100)
with rc_context({'figure.figsize': (4, 4)}):
sc.pl.umap(pbmcsarc1, color = 'leiden')
sc.pl.umap(pbmcsarc2, color = 'leiden')
sc.pl.umap(pbmcsarc3, color = 'leiden')
sc.pl.umap(pbmchealthy1, color = 'leiden')
sc.pl.umap(pbmchealthy2, color = 'leiden')
sc.pl.umap(pbmchealthy3, color = 'leiden')
sc.pl.umap(pbmchealthy4, color = 'leiden')
#we used adata as merged object
adata = pbmcsarc1.concatenate(pbmcsarc2, pbmcsarc3, pbmchealthy1, pbmchealthy2, pbmchealthy3, pbmchealthy4)
Displaying merged object observation and variables types
# Displaying merged object observation and variables types
print (adata)
print ("------###---")
#Displaying counts cells sample wise
display(adata.obs['sample'].value_counts())
#Displaying counts total cells counts of healthy and Sarcoid (Sarc)
print ("------###---")
display(adata.obs['type'].value_counts())
AnnData object with n_obs × n_vars = 51408 × 17178
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase', 'batch'
var: 'gene_ids', 'feature_types', 'mt', 'ribo', 'n_cells-0', 'n_cells_by_counts-0', 'mean_counts-0', 'pct_dropout_by_counts-0', 'total_counts-0', 'highly_variable-0', 'means-0', 'dispersions-0', 'dispersions_norm-0', 'mean-0', 'std-0', 'n_cells-1', 'n_cells_by_counts-1', 'mean_counts-1', 'pct_dropout_by_counts-1', 'total_counts-1', 'highly_variable-1', 'means-1', 'dispersions-1', 'dispersions_norm-1', 'mean-1', 'std-1', 'n_cells-2', 'n_cells_by_counts-2', 'mean_counts-2', 'pct_dropout_by_counts-2', 'total_counts-2', 'highly_variable-2', 'means-2', 'dispersions-2', 'dispersions_norm-2', 'mean-2', 'std-2', 'n_cells-3', 'n_cells_by_counts-3', 'mean_counts-3', 'pct_dropout_by_counts-3', 'total_counts-3', 'highly_variable-3', 'means-3', 'dispersions-3', 'dispersions_norm-3', 'mean-3', 'std-3', 'n_cells-4', 'n_cells_by_counts-4', 'mean_counts-4', 'pct_dropout_by_counts-4', 'total_counts-4', 'highly_variable-4', 'means-4', 'dispersions-4', 'dispersions_norm-4', 'mean-4', 'std-4', 'n_cells-5', 'n_cells_by_counts-5', 'mean_counts-5', 'pct_dropout_by_counts-5', 'total_counts-5', 'highly_variable-5', 'means-5', 'dispersions-5', 'dispersions_norm-5', 'mean-5', 'std-5', 'n_cells-6', 'n_cells_by_counts-6', 'mean_counts-6', 'pct_dropout_by_counts-6', 'total_counts-6', 'highly_variable-6', 'means-6', 'dispersions-6', 'dispersions_norm-6', 'mean-6', 'std-6'
obsm: 'X_pca', 'X_umap'
------###---
sample PBMC-healthy-4 11808 PBMC-Sarc-2 9779 PBMC-Sarc-3 8324 PBMC-Sarc-1 6962 PBMC-healthy-1 5921 PBMC-healthy-2 4881 PBMC-healthy-3 3733 Name: count, dtype: int64
------###---
type Healthy 26343 Sarcoidosis 25065 Name: count, dtype: int64
# Print merged adata var (variable) types
display (adata.var)
| gene_ids | feature_types | mt | ribo | n_cells-0 | n_cells_by_counts-0 | mean_counts-0 | pct_dropout_by_counts-0 | total_counts-0 | highly_variable-0 | ... | n_cells_by_counts-6 | mean_counts-6 | pct_dropout_by_counts-6 | total_counts-6 | highly_variable-6 | means-6 | dispersions-6 | dispersions_norm-6 | mean-6 | std-6 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AL627309.1 | ENSG00000238009 | Gene Expression | False | False | 23 | 23 | 0.003543 | 99.674082 | 25.0 | False | ... | 111 | 0.009610 | 99.072449 | 115.0 | False | 0.008591 | 0.687018 | 1.043134 | -6.657920e-12 | 0.063785 |
| AL627309.5 | ENSG00000241860 | Gene Expression | False | False | 267 | 267 | 0.039960 | 96.216523 | 282.0 | True | ... | 654 | 0.058996 | 94.534971 | 706.0 | False | 0.041723 | -0.006046 | -0.674421 | -8.028711e-11 | 0.132921 |
| LINC01409 | ENSG00000237491 | Gene Expression | False | False | 287 | 287 | 0.043645 | 95.933116 | 308.0 | False | ... | 1430 | 0.156430 | 88.050472 | 1872.0 | False | 0.117513 | 0.346010 | 0.198046 | 7.697924e-11 | 0.237550 |
| LINC01128 | ENSG00000228794 | Gene Expression | False | False | 197 | 197 | 0.029333 | 97.208446 | 207.0 | False | ... | 941 | 0.090833 | 92.136709 | 1087.0 | False | 0.077252 | 0.335086 | 0.170974 | 4.148799e-11 | 0.192739 |
| LINC00115 | ENSG00000225880 | Gene Expression | False | False | 76 | 76 | 0.010911 | 98.923055 | 77.0 | True | ... | 127 | 0.010863 | 98.938748 | 130.0 | False | 0.008700 | -0.005179 | -0.672274 | -5.707064e-12 | 0.062773 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| AC011043.1 | ENSG00000276256 | Gene Expression | False | False | 3 | 3 | 0.000425 | 99.957489 | 3.0 | False | ... | 31 | 0.002674 | 99.740954 | 32.0 | False | 0.002064 | -0.020411 | -0.710021 | -5.699130e-12 | 0.030502 |
| AL354822.1 | ENSG00000278384 | Gene Expression | False | False | 167 | 167 | 0.024798 | 97.633555 | 175.0 | False | ... | 84 | 0.007103 | 99.298070 | 85.0 | False | 0.005738 | 0.138699 | -0.315713 | 1.906735e-11 | 0.051172 |
| AL592183.1 | ENSG00000273748 | Gene Expression | False | False | 61 | 61 | 0.008786 | 99.135610 | 62.0 | False | ... | 4493 | 0.627726 | 62.455085 | 7512.0 | False | 0.424501 | 0.471774 | -0.027916 | 1.760747e-10 | 0.443237 |
| AC240274.1 | ENSG00000271254 | Gene Expression | False | False | 43 | 43 | 0.006518 | 99.390676 | 46.0 | False | ... | 473 | 0.043954 | 96.047464 | 526.0 | False | 0.033923 | 0.150248 | -0.287093 | -8.439322e-12 | 0.125632 |
| AC007325.4 | ENSG00000278817 | Gene Expression | False | False | 44 | 44 | 0.006377 | 99.376506 | 45.0 | False | ... | 152 | 0.012785 | 98.729840 | 153.0 | False | 0.009369 | -0.030823 | -0.735824 | -1.864526e-11 | 0.063992 |
17178 rows × 81 columns
# Print merged adata obs observation types
display (adata.obs)
| type | sample | percent_chrY | XIST-counts | n_genes | n_genes_by_counts | total_counts | total_counts_mt | pct_counts_mt | total_counts_ribo | ... | leiden | leiden_res0_20 | leiden_res0_40 | leiden_res0_60 | leiden_res0_80 | leiden_res1 | S_score | G2M_score | phase | batch | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AAACCCAAGACATAAC-1-0 | Sarcoidosis | PBMC-Sarc-1 | 0.341880 | 0.0 | 385 | 385 | 585.0 | 27.0 | 4.615385 | 32.0 | ... | 0 | 8 | 14 | 17 | 15 | 16 | -0.042634 | -0.017283 | G1 | 0 |
| AAACCCAAGAGGCGGA-1-0 | Sarcoidosis | PBMC-Sarc-1 | 0.125990 | 0.0 | 2191 | 2191 | 5556.0 | 423.0 | 7.613391 | 613.0 | ... | 1 | 0 | 0 | 11 | 12 | 11 | 0.004073 | -0.077179 | S | 0 |
| AAACCCAAGCGTACAG-1-0 | Sarcoidosis | PBMC-Sarc-1 | 0.104749 | 0.0 | 936 | 936 | 2864.0 | 253.0 | 8.833798 | 1131.0 | ... | 1 | 3 | 3 | 4 | 6 | 6 | -0.052329 | -0.069784 | G1 | 0 |
| AAACCCAAGGTACAAT-1-0 | Sarcoidosis | PBMC-Sarc-1 | 0.172697 | 0.0 | 3622 | 3620 | 11579.0 | 736.0 | 6.356335 | 1679.0 | ... | 17 | 0 | 0 | 8 | 8 | 8 | 0.009910 | -0.103200 | S | 0 |
| AAACCCACAGCGTACC-1-0 | Sarcoidosis | PBMC-Sarc-1 | 0.160607 | 0.0 | 2219 | 2219 | 6849.0 | 536.0 | 7.825960 | 1114.0 | ... | 11 | 4 | 4 | 3 | 5 | 5 | -0.062317 | -0.104943 | G1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| TTTGTTGGTTGGATCT-1-6 | Healthy | PBMC-healthy-4 | 0.150765 | 2.0 | 2795 | 2795 | 9286.0 | 391.0 | 4.210640 | 2746.0 | ... | 10 | 2 | 2 | 3 | 3 | 10 | -0.068829 | 0.012108 | G2M | 6 |
| TTTGTTGGTTTCTTAC-1-6 | Healthy | PBMC-healthy-4 | 0.115224 | 1.0 | 2891 | 2891 | 6943.0 | 211.0 | 3.039032 | 952.0 | ... | 14 | 5 | 5 | 4 | 5 | 14 | -0.030879 | -0.003356 | G1 | 6 |
| TTTGTTGTCCATTTCA-1-6 | Healthy | PBMC-healthy-4 | 0.170916 | 2.0 | 2539 | 2538 | 7020.0 | 376.0 | 5.356125 | 1831.0 | ... | 4 | 1 | 1 | 2 | 4 | 4 | 0.005200 | -0.003864 | S | 6 |
| TTTGTTGTCTACACAG-1-6 | Healthy | PBMC-healthy-4 | 0.082936 | 1.0 | 3581 | 3581 | 9646.0 | 292.0 | 3.027162 | 637.0 | ... | 13 | 6 | 7 | 9 | 12 | 13 | -0.002597 | -0.035551 | G1 | 6 |
| TTTGTTGTCTCATTAC-1-6 | Healthy | PBMC-healthy-4 | 0.147313 | 1.0 | 4647 | 4647 | 15613.0 | 487.0 | 3.119196 | 2341.0 | ... | 19 | 4 | 6 | 6 | 17 | 19 | -0.036991 | -0.025612 | G1 | 6 |
51408 rows × 21 columns
UMAP after sample integration
sc.pl.umap(adata, color="sample")
sc.pl.umap(adata, color="leiden")
#Doublet detetection using Scrublet
import warnings
# Suppress RuntimeWarning for invalid value encountered in log
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="invalid value encountered in log")
#warnings.filterwarnings("ignore", message="invalid value encountered in sqrt")
print ("Doublet detection")
import scrublet as scr
# split per batch into new objects.
batches = adata.obs['sample'].cat.categories.tolist()
alldata = {}
for batch in batches:
tmp = adata[adata.obs['sample'] == batch,]
print(batch, ":", tmp.shape[0], " cells")
scrub = scr.Scrublet(tmp.raw.X, expected_doublet_rate = 0.06)
out = scrub.scrub_doublets(verbose=False, min_counts=2, min_cells=3,min_gene_variability_pctl=85,n_prin_comps=20)
alldata[batch] = pd.DataFrame({'doublet_score':out[0],'predicted_doublets':out[1]},index = tmp.obs.index)
print(alldata[batch].predicted_doublets.sum(), " predicted_doublets")
print(round(alldata[batch].predicted_doublets.sum()/tmp.shape[0]*100,2), " predicted doublet Percentage\n")
Doublet detection PBMC-Sarc-1 : 6962 cells 245 predicted_doublets 3.52 predicted doublet Percentage PBMC-Sarc-2 : 9779 cells 524 predicted_doublets 5.36 predicted doublet Percentage PBMC-Sarc-3 : 8324 cells 157 predicted_doublets 1.89 predicted doublet Percentage PBMC-healthy-1 : 5921 cells 127 predicted_doublets 2.14 predicted doublet Percentage PBMC-healthy-2 : 4881 cells 111 predicted_doublets 2.27 predicted doublet Percentage PBMC-healthy-3 : 3733 cells 22 predicted_doublets 0.59 predicted doublet Percentage PBMC-healthy-4 : 11808 cells 293 predicted_doublets 2.48 predicted doublet Percentage
Histogram plot of doublet detection
#Histogram plot doublet detection
scrub.plot_histogram();
Doublet detector predictions adding to the adata object
# Doublet detector predictions adding to the adata object.
scrub_pred = pd.concat(alldata.values())
adata.obs['doublet_scores'] = scrub_pred['doublet_score']
adata.obs['predicted_doublets'] = scrub_pred['predicted_doublets']
print("Total predicted doublets of all samples:", sum(adata.obs['predicted_doublets']))
Total predicted doublets of all samples: 1479
True means: confirmed doublets and False means: singlets
# add in column with singlet/doublet instead of True/False
%matplotlib inline
adata.obs['doublet_info'] = adata.obs["predicted_doublets"].astype(str)
sc.pl.violin(adata, 'n_genes_by_counts',
jitter=0.4, groupby = 'doublet_info', rotation=45)
... storing 'doublet_info' as categorical
Displaying Doublet scores, Doublet info and Sample wise distrubtion
#Displaying Doublet scores, Doublet info and Sample wise distrubtion
print ("Doublet finding plots with scores and info across the samples")
sc.pl.umap(adata, color=['doublet_scores','doublet_info','sample'])
Doublet finding plots with scores and info across the samples
Doublet removing and Rest samples for post processing
#Doublet removing and Rest samples for post processing
print ("Considering the False Doublet finding information, means we are considering non doublet portion for the rest of tha analysis")
# also revert back to the raw counts as the main matrix in adata
adata = adata.raw.to_adata()
adata = adata[adata.obs['doublet_info'] == 'False',:]
print("Current shape of the data:", adata.shape)
Considering the False Doublet finding information, means we are considering non doublet portion for the rest of tha analysis Current shape of the data: (49929, 17178)
Displaying merged object
adata
View of AnnData object with n_obs × n_vars = 49929 × 17178
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase', 'batch', 'doublet_scores', 'predicted_doublets', 'doublet_info'
uns: 'sample_colors', 'leiden_colors', 'doublet_info_colors'
obsm: 'X_pca', 'X_umap'
We conducted a comparison between our dataset and two existing methods: BBKN (Batch balanced KNN) introduced in the paper published in 2019 in Bioinformatics, and Harmony, presented in the 2019 Nature paper.
BBKN: Batch balanced KNN method
%%time
# Copy adata not to modify UMAP in the original adata object
adata_temp=adata.copy()
sc.tl.pca(adata_temp)
#%%time
sc.external.pp.bbknn(adata_temp, batch_key='sample')
sc.tl.umap(adata_temp)
sc.pl.umap(adata_temp, color=['sample', 'leiden'])
del adata_temp
computing PCA
with n_comps=50
finished (0:01:31)
computing batch balanced neighbors
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:20)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:01:29)
CPU times: user 8min 51s, sys: 9min 38s, total: 18min 30s Wall time: 3min 26s
Harmony method
%%time
# Copy adata not to modify UMAP in the original adata object
adata_temp=adata.copy()
sc.tl.pca(adata_temp, n_comps=20, svd_solver='arpack')
sce.pp.harmony_integrate(adata_temp, 'sample')
def post_process_harmonization(adata_temp):
print("Post-processing of Harmonization")
# Set current PCA to be aligned with Harmonized PCA values
adata_temp.obsm['X_pca'] = adata_temp.obsm['X_pca_harmony']
# Compute neighborhood graphs and clustering
print("Computing neighborhood graphs and Clustering")
sc.pp.neighbors(adata_temp, n_neighbors=10, n_pcs=20)
# Cluster the neighborhood graph using Leiden Clustering algorithm
sc.tl.leiden(adata_temp)
sc.tl.umap(adata_temp)
# Example usage
post_process_harmonization(adata_temp)
sc.tl.umap(adata_temp)
sc.pl.umap(adata_temp, color=['sample', 'leiden'])
del adata_temp
computing PCA
with n_comps=20
finished (0:00:35)
2024-02-23 17:11:34,723 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans... INFO:harmonypy:Computing initial centroids with sklearn.KMeans... 2024-02-23 17:11:50,081 - harmonypy - INFO - sklearn.KMeans initialization complete. INFO:harmonypy:sklearn.KMeans initialization complete. 2024-02-23 17:11:50,823 - harmonypy - INFO - Iteration 1 of 10 INFO:harmonypy:Iteration 1 of 10 2024-02-23 17:12:23,206 - harmonypy - INFO - Iteration 2 of 10 INFO:harmonypy:Iteration 2 of 10 2024-02-23 17:12:56,056 - harmonypy - INFO - Iteration 3 of 10 INFO:harmonypy:Iteration 3 of 10 2024-02-23 17:13:29,231 - harmonypy - INFO - Iteration 4 of 10 INFO:harmonypy:Iteration 4 of 10 2024-02-23 17:14:02,262 - harmonypy - INFO - Iteration 5 of 10 INFO:harmonypy:Iteration 5 of 10 2024-02-23 17:14:34,792 - harmonypy - INFO - Iteration 6 of 10 INFO:harmonypy:Iteration 6 of 10 2024-02-23 17:15:07,531 - harmonypy - INFO - Iteration 7 of 10 INFO:harmonypy:Iteration 7 of 10 2024-02-23 17:15:20,672 - harmonypy - INFO - Iteration 8 of 10 INFO:harmonypy:Iteration 8 of 10 2024-02-23 17:15:30,121 - harmonypy - INFO - Iteration 9 of 10 INFO:harmonypy:Iteration 9 of 10 2024-02-23 17:15:39,356 - harmonypy - INFO - Iteration 10 of 10 INFO:harmonypy:Iteration 10 of 10 2024-02-23 17:15:49,224 - harmonypy - INFO - Stopped before convergence INFO:harmonypy:Stopped before convergence
Post-processing of Harmonization
Computing neighborhood graphs and Clustering
computing neighbors
using 'X_pca' with n_pcs = 20
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:15)
running Leiden clustering
finished: found 27 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:11)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:01:01)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:00:59)
CPU times: user 49min 22s, sys: 1h 9min 19s, total: 1h 58min 42s Wall time: 7min 21s
Violin plot of 'G2M_score', 'S_score' across samples
sc.pl.violin(adata, keys=['G2M_score', 'S_score'], groupby='sample',rotation=90)
PCA plot of 'G2M_score', 'S_score' and 'phase' across samples
sc.pl.pca(adata, color= ['S_score', 'G2M_score','phase'])
Display merged object adata
adata
AnnData object with n_obs × n_vars = 49929 × 17178
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase', 'batch', 'doublet_scores', 'predicted_doublets', 'doublet_info'
uns: 'sample_colors', 'leiden_colors', 'doublet_info_colors', 'phase_colors'
obsm: 'X_pca', 'X_umap'
Harmony method was appiled into actual merged object
%%time
# Copy adata not to modify UMAP in the original adata object
sc.tl.pca(adata, n_comps=20, svd_solver='arpack')
sce.pp.harmony_integrate(adata, 'sample')
def post_process_harmonization(adata):
print("Post-processing of Harmonization")
# Set current PCA to be aligned with Harmonized PCA values
adata.obsm['X_pca'] = adata.obsm['X_pca_harmony']
# Compute neighborhood graphs and clustering
print("Computing neighborhood graphs and Clustering")
sc.pp.neighbors(adata, n_neighbors=12, n_pcs=20)
# Cluster the neighborhood graph using Leiden Clustering algorithm
sc.tl.leiden(adata)
sc.tl.umap(adata)
# usage
post_process_harmonization(adata)
sc.tl.umap(adata)
sc.pl.umap(adata, color=['sample', 'leiden'])
computing PCA
with n_comps=20
finished (0:00:35)
2024-02-23 17:19:01,814 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans... INFO:harmonypy:Computing initial centroids with sklearn.KMeans... 2024-02-23 17:19:16,903 - harmonypy - INFO - sklearn.KMeans initialization complete. INFO:harmonypy:sklearn.KMeans initialization complete. 2024-02-23 17:19:17,649 - harmonypy - INFO - Iteration 1 of 10 INFO:harmonypy:Iteration 1 of 10 2024-02-23 17:19:51,528 - harmonypy - INFO - Iteration 2 of 10 INFO:harmonypy:Iteration 2 of 10 2024-02-23 17:20:25,093 - harmonypy - INFO - Iteration 3 of 10 INFO:harmonypy:Iteration 3 of 10 2024-02-23 17:20:58,328 - harmonypy - INFO - Iteration 4 of 10 INFO:harmonypy:Iteration 4 of 10 2024-02-23 17:21:32,122 - harmonypy - INFO - Iteration 5 of 10 INFO:harmonypy:Iteration 5 of 10 2024-02-23 17:22:04,756 - harmonypy - INFO - Iteration 6 of 10 INFO:harmonypy:Iteration 6 of 10 2024-02-23 17:22:38,589 - harmonypy - INFO - Iteration 7 of 10 INFO:harmonypy:Iteration 7 of 10 2024-02-23 17:22:54,770 - harmonypy - INFO - Iteration 8 of 10 INFO:harmonypy:Iteration 8 of 10 2024-02-23 17:23:04,714 - harmonypy - INFO - Iteration 9 of 10 INFO:harmonypy:Iteration 9 of 10 2024-02-23 17:23:14,337 - harmonypy - INFO - Iteration 10 of 10 INFO:harmonypy:Iteration 10 of 10 2024-02-23 17:23:24,225 - harmonypy - INFO - Stopped before convergence INFO:harmonypy:Stopped before convergence
Post-processing of Harmonization
Computing neighborhood graphs and Clustering
computing neighbors
using 'X_pca' with n_pcs = 20
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:11)
running Leiden clustering
finished: found 25 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:10)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:01:10)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:01:06)
CPU times: user 50min 29s, sys: 1h 11min 19s, total: 2h 1min 49s Wall time: 7min 39s
UMAP using rc_context method
with rc_context({'figure.figsize': (4, 4)}):
sc.pl.umap(adata, color = 'sample')
import pandas as pd
# Assuming 'merged' is your DataFrame
counts = adata.obs.groupby(['sample', 'leiden']).count().reset_index()
def map_per(row):
samp, y = row['sample'], row['total_counts']
tot = counts.loc[counts['sample'] == samp, 'total_counts'].sum()
return (y / tot) * 100 if tot != 0 else 0
counts['per'] = counts.apply(map_per, axis=1)
# Display the resulting DataFrame
display(counts)
| sample | leiden | type | percent_chrY | XIST-counts | n_genes | n_genes_by_counts | total_counts | total_counts_mt | pct_counts_mt | ... | leiden_res0_80 | leiden_res1 | S_score | G2M_score | phase | batch | doublet_scores | predicted_doublets | doublet_info | per | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | PBMC-Sarc-1 | 0 | 751 | 751 | 751 | 751 | 751 | 751 | 751 | 751 | ... | 751 | 751 | 751 | 751 | 751 | 751 | 751 | 751 | 751 | 11.180587 |
| 1 | PBMC-Sarc-1 | 1 | 858 | 858 | 858 | 858 | 858 | 858 | 858 | 858 | ... | 858 | 858 | 858 | 858 | 858 | 858 | 858 | 858 | 858 | 12.773560 |
| 2 | PBMC-Sarc-1 | 2 | 741 | 741 | 741 | 741 | 741 | 741 | 741 | 741 | ... | 741 | 741 | 741 | 741 | 741 | 741 | 741 | 741 | 741 | 11.031711 |
| 3 | PBMC-Sarc-1 | 3 | 739 | 739 | 739 | 739 | 739 | 739 | 739 | 739 | ... | 739 | 739 | 739 | 739 | 739 | 739 | 739 | 739 | 739 | 11.001935 |
| 4 | PBMC-Sarc-1 | 4 | 493 | 493 | 493 | 493 | 493 | 493 | 493 | 493 | ... | 493 | 493 | 493 | 493 | 493 | 493 | 493 | 493 | 493 | 7.339586 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 170 | PBMC-healthy-4 | 20 | 130 | 130 | 130 | 130 | 130 | 130 | 130 | 130 | ... | 130 | 130 | 130 | 130 | 130 | 130 | 130 | 130 | 130 | 1.128962 |
| 171 | PBMC-healthy-4 | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.000000 |
| 172 | PBMC-healthy-4 | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.000000 |
| 173 | PBMC-healthy-4 | 23 | 33 | 33 | 33 | 33 | 33 | 33 | 33 | 33 | ... | 33 | 33 | 33 | 33 | 33 | 33 | 33 | 33 | 33 | 0.286583 |
| 174 | PBMC-healthy-4 | 24 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | ... | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 0.026053 |
175 rows × 25 columns
Plot of Percentage of Total Counts across leiden clusters
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize = (10,3))
ax = sns.barplot(x = 'leiden', y = 'per', hue = 'sample', data = counts)
ax.set(xlabel='Leiden',
ylabel='Percenatge_total_counts')
plt.legend(bbox_to_anchor=(1.4,0.8))
<matplotlib.legend.Legend at 0x7f80accccfd0>
#saving into CSV data
counts.to_csv('/home/jana/Merged_object_metadata.csv')
Plot of Percentage of Total Counts across samples
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 3))
ax = sns.barplot(x='sample', y='per', hue='leiden', data=counts)
ax.set(xlabel='sample ',
ylabel='Percentage_total_counts')
plt.legend(bbox_to_anchor=(1.4, 0.8))
# Rotate x-axis labels by 90 degrees
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
plt.show()
Sample wise density estimation
#Gaussian kernel density estimation is used to calculate the density of cells in an embedded space.
#This can be performed per category over a categorical cell annotation.
#Calcuting the density per sample
sc.tl.embedding_density(adata, groupby='sample')
#plot the density per sample
sc.pl.embedding_density(adata, groupby='sample')
computing density on 'umap'
--> added
'umap_density_sample', densities (adata.obs)
'umap_density_sample_params', parameter (adata.uns)
Sample wise density estimation across leiden clusters
#Gaussian kernel density estimation is used to calculate the density of cells in an embedded space.
#This can be performed per category over a categorical cell annotation.
#Calcuting the density per sample
sc.tl.embedding_density(adata, groupby='leiden')
#plot the density per sample
sc.pl.embedding_density(adata, groupby='leiden')
computing density on 'umap'
--> added
'umap_density_leiden', densities (adata.obs)
'umap_density_leiden_params', parameter (adata.uns)
#Computing with a series of resolution parameters and silhouette_scores.
#Like various algorithms, Leiden has also a parameter named the resolution.
#It can control the coarseness of the clustering.
#Higher values of resolution mean it leads to more clusters.
#Computing Silhouette Coefficient or Silhouette Score, a metric that was used to calculate the goodness of a clustering.
# -1 <= silhouette score<= 1.
# Copy adata not to modify UMAP in the original adata object
adata_temp=adata.copy()
from sklearn.metrics import silhouette_score
# Define a list of resolution parameters
resolutions = [round(r, 2) for r in [.05] + list(np.linspace(.1, 1.6, 16))]
# Print a message indicating the start of the computation
print("Computing silhouette scores with different resolution parameters")
# Iterate over each resolution parameter and compute the silhouette score
for resolution in resolutions:
# Apply the Leiden clustering algorithm with the current resolution parameter
sc.tl.leiden(adata_temp, resolution=resolution)
# Compute the silhouette score for the clustering result
silhouette = silhouette_score(adata_temp.obsm['X_umap'], adata_temp.obs[f'leiden'], metric='euclidean')
# Print the silhouette score for the current resolution parameter
print(f"Silhouette score for resolution {resolution}: {silhouette}\n")
#delete temporary adata_temp
del adata_temp
Computing silhouette scores with different resolution parameters
running Leiden clustering
finished: found 6 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:06)
Silhouette score for resolution 0.05: 0.47803694009780884
running Leiden clustering
finished: found 8 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:05)
Silhouette score for resolution 0.1: 0.4228675961494446
running Leiden clustering
finished: found 9 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:05)
Silhouette score for resolution 0.2: 0.423541784286499
running Leiden clustering
finished: found 13 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:06)
Silhouette score for resolution 0.3: 0.33491015434265137
running Leiden clustering
finished: found 13 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:05)
Silhouette score for resolution 0.4: 0.314763605594635
running Leiden clustering
finished: found 15 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:08)
Silhouette score for resolution 0.5: 0.333285927772522
running Leiden clustering
finished: found 19 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:16)
Silhouette score for resolution 0.6: 0.22321264445781708
running Leiden clustering
finished: found 19 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:12)
Silhouette score for resolution 0.7: 0.259266197681427
running Leiden clustering
finished: found 21 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:08)
Silhouette score for resolution 0.8: 0.24786916375160217
running Leiden clustering
finished: found 24 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:10)
Silhouette score for resolution 0.9: 0.2332153469324112
running Leiden clustering
finished: found 25 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:10)
Silhouette score for resolution 1.0: 0.2215575873851776
running Leiden clustering
finished: found 27 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:14)
Silhouette score for resolution 1.1: 0.21165627241134644
running Leiden clustering
finished: found 30 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:19)
Silhouette score for resolution 1.2: 0.16172157227993011
running Leiden clustering
finished: found 34 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:14)
Silhouette score for resolution 1.3: 0.17167407274246216
running Leiden clustering
finished: found 35 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:10)
Silhouette score for resolution 1.4: 0.16716261208057404
running Leiden clustering
finished: found 37 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:15)
Silhouette score for resolution 1.5: 0.15319690108299255
running Leiden clustering
finished: found 38 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:12)
Silhouette score for resolution 1.6: 0.18699465692043304
Plot of Percentage of Total Counts, samples, clusters and average silhouette_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
# Assuming you have already loaded your data into variables adata and adata_temp
adata_temp=adata.copy()
# Calculate silhouette score
silhouette_avg = silhouette_score(adata.obsm['X_umap'], adata.obs[f'leiden'], metric='euclidean')
print("The average silhouette_score is :", silhouette_avg)
# Plot
plt.figure(figsize=(10, 3))
ax = sns.barplot(x='sample', y='per', hue='leiden', data=counts)
ax.set(xlabel='sample ', ylabel='Percentage_total_counts')
plt.legend(bbox_to_anchor=(1.4, 0.8))
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
plt.show()
The average silhouette_score is : 0.22155759
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_samples, silhouette_score
# Assuming you have already loaded your data into variables adata and adata_temp
# Calculate silhouette scores for each sample
silhouette_scores = silhouette_samples(adata.obsm['X_umap'], np.array(adata.obs[f'leiden']), metric='euclidean')
# Calculate the overall silhouette score
silhouette_avg = silhouette_score(adata.obsm['X_umap'], np.array(adata.obs[f'leiden']), metric='euclidean')
# Get sample names
sample_names = adata.obs['sample'] # Assuming 'sample' is the column name containing sample names
# Create a bar plot of silhouette scores with sample names on the x-axis
plt.figure(figsize=(15, 6)) # Adjusted figure size for better readability
ax = sns.barplot(x=sample_names, y=silhouette_scores, palette='viridis')
# Add a horizontal line for the average silhouette score
plt.axhline(y=silhouette_avg, color="red", linestyle="--")
plt.title('Silhouette plot')
plt.xlabel('Samples')
plt.ylabel('Silhouette score')
plt.xticks(rotation=90) # Rotate x-axis labels for better readability
plt.legend(bbox_to_anchor=(1.4, 0.8))
plt.show()
WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
The keys corresponding to silhouette scores are added to the merged objects. We opted for three resolutions, namely 0.5, 0.6, and 1.0 (default), for Leiden clustering
#clustering comparison
#Based on Silhouette scores, We selected three resolutions 0.05 and 0.2, 0.5, and 1.0 (default) for leiden clustering
#default resolution 1.0 with Silhouette score: 0.22, leiden clustering and key is added
sc.tl.leiden(adata, key_added="leiden_1.0")
#Resolution 0.5 with with Silhouette score: 0.33, leiden clustering and key is added
sc.tl.leiden(adata, resolution = 0.5, key_added = "leiden_0.5")
#Resolution 0.2 with with Silhouette score: 0.42, leiden clustering and key is added
sc.tl.leiden(adata, resolution = 0.2, key_added = "leiden_0.2")
#Resolution 0.05 with with Silhouette score: 0.48, leiden clustering and key is added
sc.tl.leiden(adata, resolution = 0.05, key_added = "leiden_0.05")
running Leiden clustering
finished: found 25 clusters and added
'leiden_1.0', the cluster labels (adata.obs, categorical) (0:00:13)
running Leiden clustering
finished: found 15 clusters and added
'leiden_0.5', the cluster labels (adata.obs, categorical) (0:00:11)
running Leiden clustering
finished: found 9 clusters and added
'leiden_0.2', the cluster labels (adata.obs, categorical) (0:00:07)
running Leiden clustering
finished: found 6 clusters and added
'leiden_0.05', the cluster labels (adata.obs, categorical) (0:00:06)
clustering UMAP plot based Silhouette scores in decresing order
#clustering plot based Silhouette scores in decresing order
sc.pl.umap(adata, color=['leiden_1.0','leiden_0.5', 'leiden_0.2','leiden_0.05'], legend_loc="on data")
#import scipy io package
from scipy import io
save_file = '/home/jana/Updated_SCANPY_QC_filtered_PBMC_for_Sarcoidosis.h5ad'
adata.write_h5ad(save_file)
Percentage Checking of Total Counts checking inside leiden_0.5
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Assuming 'merged' is your DataFrame
counts_05 = adata.obs.groupby(['sample', 'leiden_0.5']).count().reset_index()
def map_per(row):
samp, y = row['sample'], row['total_counts']
tot05 = counts_05.loc[counts_05['sample'] == samp, 'total_counts'].sum()
return (y / tot05) * 100 if tot05 != 0 else 0
counts_05['pernew'] = counts_05.apply(map_per, axis=1)
# Display the resulting DataFrame
display(counts_05)
| sample | leiden_0.5 | type | percent_chrY | XIST-counts | n_genes | n_genes_by_counts | total_counts | total_counts_mt | pct_counts_mt | ... | batch | doublet_scores | predicted_doublets | doublet_info | umap_density_sample | umap_density_leiden | leiden_1.0 | leiden_0.2 | leiden_0.05 | pernew | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | PBMC-Sarc-1 | 0 | 1580 | 1580 | 1580 | 1580 | 1580 | 1580 | 1580 | 1580 | ... | 1580 | 1580 | 1580 | 1580 | 1580 | 1580 | 1580 | 1580 | 1580 | 23.522406 |
| 1 | PBMC-Sarc-1 | 1 | 772 | 772 | 772 | 772 | 772 | 772 | 772 | 772 | ... | 772 | 772 | 772 | 772 | 772 | 772 | 772 | 772 | 772 | 11.493226 |
| 2 | PBMC-Sarc-1 | 2 | 813 | 813 | 813 | 813 | 813 | 813 | 813 | 813 | ... | 813 | 813 | 813 | 813 | 813 | 813 | 813 | 813 | 813 | 12.103618 |
| 3 | PBMC-Sarc-1 | 3 | 741 | 741 | 741 | 741 | 741 | 741 | 741 | 741 | ... | 741 | 741 | 741 | 741 | 741 | 741 | 741 | 741 | 741 | 11.031711 |
| 4 | PBMC-Sarc-1 | 4 | 593 | 593 | 593 | 593 | 593 | 593 | 593 | 593 | ... | 593 | 593 | 593 | 593 | 593 | 593 | 593 | 593 | 593 | 8.828346 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 100 | PBMC-healthy-4 | 10 | 260 | 260 | 260 | 260 | 260 | 260 | 260 | 260 | ... | 260 | 260 | 260 | 260 | 260 | 260 | 260 | 260 | 260 | 2.257924 |
| 101 | PBMC-healthy-4 | 11 | 180 | 180 | 180 | 180 | 180 | 180 | 180 | 180 | ... | 180 | 180 | 180 | 180 | 180 | 180 | 180 | 180 | 180 | 1.563178 |
| 102 | PBMC-healthy-4 | 12 | 126 | 126 | 126 | 126 | 126 | 126 | 126 | 126 | ... | 126 | 126 | 126 | 126 | 126 | 126 | 126 | 126 | 126 | 1.094225 |
| 103 | PBMC-healthy-4 | 13 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | ... | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 0.191055 |
| 104 | PBMC-healthy-4 | 14 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | ... | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 0.026053 |
105 rows × 31 columns
Plot of Percentage Total Counts vs leiden_0.5 of all samples
plt.figure(figsize = (10,3))
ax = sns.barplot(x = 'leiden_0.5', y = 'pernew', hue = 'sample', data = counts_05)
ax.set(xlabel='leiden_0.5',
ylabel='Percenatge_total_counts')
plt.legend(bbox_to_anchor=(1.4,0.8))
<matplotlib.legend.Legend at 0x7f7efd303790>
Percentage Checking of Total Counts checking inside leiden_0.05
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Assuming 'merged' is your DataFrame
countsp_05 = adata.obs.groupby(['sample', 'leiden_0.05']).count().reset_index()
def map_per(row):
samp, y = row['sample'], row['total_counts']
totp05 = countsp_05.loc[countsp_05['sample'] == samp, 'total_counts'].sum()
return (y / totp05) * 100 if totp05 != 0 else 0
countsp_05['pernewpo5'] = countsp_05.apply(map_per, axis=1)
# Display the resulting DataFrame
display(countsp_05)
| sample | leiden_0.05 | type | percent_chrY | XIST-counts | n_genes | n_genes_by_counts | total_counts | total_counts_mt | pct_counts_mt | ... | batch | doublet_scores | predicted_doublets | doublet_info | umap_density_sample | umap_density_leiden | leiden_1.0 | leiden_0.5 | leiden_0.2 | pernewpo5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | PBMC-Sarc-1 | 0 | 2861 | 2861 | 2861 | 2861 | 2861 | 2861 | 2861 | 2861 | ... | 2861 | 2861 | 2861 | 2861 | 2861 | 2861 | 2861 | 2861 | 2861 | 42.593420 |
| 1 | PBMC-Sarc-1 | 1 | 1725 | 1725 | 1725 | 1725 | 1725 | 1725 | 1725 | 1725 | ... | 1725 | 1725 | 1725 | 1725 | 1725 | 1725 | 1725 | 1725 | 1725 | 25.681108 |
| 2 | PBMC-Sarc-1 | 2 | 1136 | 1136 | 1136 | 1136 | 1136 | 1136 | 1136 | 1136 | ... | 1136 | 1136 | 1136 | 1136 | 1136 | 1136 | 1136 | 1136 | 1136 | 16.912312 |
| 3 | PBMC-Sarc-1 | 3 | 742 | 742 | 742 | 742 | 742 | 742 | 742 | 742 | ... | 742 | 742 | 742 | 742 | 742 | 742 | 742 | 742 | 742 | 11.046598 |
| 4 | PBMC-Sarc-1 | 4 | 221 | 221 | 221 | 221 | 221 | 221 | 221 | 221 | ... | 221 | 221 | 221 | 221 | 221 | 221 | 221 | 221 | 221 | 3.290159 |
| 5 | PBMC-Sarc-1 | 5 | 32 | 32 | 32 | 32 | 32 | 32 | 32 | 32 | ... | 32 | 32 | 32 | 32 | 32 | 32 | 32 | 32 | 32 | 0.476403 |
| 6 | PBMC-Sarc-2 | 0 | 3326 | 3326 | 3326 | 3326 | 3326 | 3326 | 3326 | 3326 | ... | 3326 | 3326 | 3326 | 3326 | 3326 | 3326 | 3326 | 3326 | 3326 | 35.937331 |
| 7 | PBMC-Sarc-2 | 1 | 1749 | 1749 | 1749 | 1749 | 1749 | 1749 | 1749 | 1749 | ... | 1749 | 1749 | 1749 | 1749 | 1749 | 1749 | 1749 | 1749 | 1749 | 18.897893 |
| 8 | PBMC-Sarc-2 | 2 | 2264 | 2264 | 2264 | 2264 | 2264 | 2264 | 2264 | 2264 | ... | 2264 | 2264 | 2264 | 2264 | 2264 | 2264 | 2264 | 2264 | 2264 | 24.462453 |
| 9 | PBMC-Sarc-2 | 3 | 1723 | 1723 | 1723 | 1723 | 1723 | 1723 | 1723 | 1723 | ... | 1723 | 1723 | 1723 | 1723 | 1723 | 1723 | 1723 | 1723 | 1723 | 18.616964 |
| 10 | PBMC-Sarc-2 | 4 | 151 | 151 | 151 | 151 | 151 | 151 | 151 | 151 | ... | 151 | 151 | 151 | 151 | 151 | 151 | 151 | 151 | 151 | 1.631551 |
| 11 | PBMC-Sarc-2 | 5 | 42 | 42 | 42 | 42 | 42 | 42 | 42 | 42 | ... | 42 | 42 | 42 | 42 | 42 | 42 | 42 | 42 | 42 | 0.453809 |
| 12 | PBMC-Sarc-3 | 0 | 3670 | 3670 | 3670 | 3670 | 3670 | 3670 | 3670 | 3670 | ... | 3670 | 3670 | 3670 | 3670 | 3670 | 3670 | 3670 | 3670 | 3670 | 44.936941 |
| 13 | PBMC-Sarc-3 | 1 | 2536 | 2536 | 2536 | 2536 | 2536 | 2536 | 2536 | 2536 | ... | 2536 | 2536 | 2536 | 2536 | 2536 | 2536 | 2536 | 2536 | 2536 | 31.051794 |
| 14 | PBMC-Sarc-3 | 2 | 1081 | 1081 | 1081 | 1081 | 1081 | 1081 | 1081 | 1081 | ... | 1081 | 1081 | 1081 | 1081 | 1081 | 1081 | 1081 | 1081 | 1081 | 13.236194 |
| 15 | PBMC-Sarc-3 | 3 | 649 | 649 | 649 | 649 | 649 | 649 | 649 | 649 | ... | 649 | 649 | 649 | 649 | 649 | 649 | 649 | 649 | 649 | 7.946614 |
| 16 | PBMC-Sarc-3 | 4 | 200 | 200 | 200 | 200 | 200 | 200 | 200 | 200 | ... | 200 | 200 | 200 | 200 | 200 | 200 | 200 | 200 | 200 | 2.448880 |
| 17 | PBMC-Sarc-3 | 5 | 31 | 31 | 31 | 31 | 31 | 31 | 31 | 31 | ... | 31 | 31 | 31 | 31 | 31 | 31 | 31 | 31 | 31 | 0.379576 |
| 18 | PBMC-healthy-1 | 0 | 2240 | 2240 | 2240 | 2240 | 2240 | 2240 | 2240 | 2240 | ... | 2240 | 2240 | 2240 | 2240 | 2240 | 2240 | 2240 | 2240 | 2240 | 38.660683 |
| 19 | PBMC-healthy-1 | 1 | 2277 | 2277 | 2277 | 2277 | 2277 | 2277 | 2277 | 2277 | ... | 2277 | 2277 | 2277 | 2277 | 2277 | 2277 | 2277 | 2277 | 2277 | 39.299275 |
| 20 | PBMC-healthy-1 | 2 | 550 | 550 | 550 | 550 | 550 | 550 | 550 | 550 | ... | 550 | 550 | 550 | 550 | 550 | 550 | 550 | 550 | 550 | 9.492579 |
| 21 | PBMC-healthy-1 | 3 | 519 | 519 | 519 | 519 | 519 | 519 | 519 | 519 | ... | 519 | 519 | 519 | 519 | 519 | 519 | 519 | 519 | 519 | 8.957542 |
| 22 | PBMC-healthy-1 | 4 | 201 | 201 | 201 | 201 | 201 | 201 | 201 | 201 | ... | 201 | 201 | 201 | 201 | 201 | 201 | 201 | 201 | 201 | 3.469106 |
| 23 | PBMC-healthy-1 | 5 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | ... | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 0.120815 |
| 24 | PBMC-healthy-2 | 0 | 999 | 999 | 999 | 999 | 999 | 999 | 999 | 999 | ... | 999 | 999 | 999 | 999 | 999 | 999 | 999 | 999 | 999 | 20.943396 |
| 25 | PBMC-healthy-2 | 1 | 2110 | 2110 | 2110 | 2110 | 2110 | 2110 | 2110 | 2110 | ... | 2110 | 2110 | 2110 | 2110 | 2110 | 2110 | 2110 | 2110 | 2110 | 44.234801 |
| 26 | PBMC-healthy-2 | 2 | 718 | 718 | 718 | 718 | 718 | 718 | 718 | 718 | ... | 718 | 718 | 718 | 718 | 718 | 718 | 718 | 718 | 718 | 15.052411 |
| 27 | PBMC-healthy-2 | 3 | 586 | 586 | 586 | 586 | 586 | 586 | 586 | 586 | ... | 586 | 586 | 586 | 586 | 586 | 586 | 586 | 586 | 586 | 12.285115 |
| 28 | PBMC-healthy-2 | 4 | 335 | 335 | 335 | 335 | 335 | 335 | 335 | 335 | ... | 335 | 335 | 335 | 335 | 335 | 335 | 335 | 335 | 335 | 7.023061 |
| 29 | PBMC-healthy-2 | 5 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | ... | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 0.461216 |
| 30 | PBMC-healthy-3 | 0 | 749 | 749 | 749 | 749 | 749 | 749 | 749 | 749 | ... | 749 | 749 | 749 | 749 | 749 | 749 | 749 | 749 | 749 | 20.183239 |
| 31 | PBMC-healthy-3 | 1 | 1760 | 1760 | 1760 | 1760 | 1760 | 1760 | 1760 | 1760 | ... | 1760 | 1760 | 1760 | 1760 | 1760 | 1760 | 1760 | 1760 | 1760 | 47.426570 |
| 32 | PBMC-healthy-3 | 2 | 714 | 714 | 714 | 714 | 714 | 714 | 714 | 714 | ... | 714 | 714 | 714 | 714 | 714 | 714 | 714 | 714 | 714 | 19.240097 |
| 33 | PBMC-healthy-3 | 3 | 264 | 264 | 264 | 264 | 264 | 264 | 264 | 264 | ... | 264 | 264 | 264 | 264 | 264 | 264 | 264 | 264 | 264 | 7.113985 |
| 34 | PBMC-healthy-3 | 4 | 73 | 73 | 73 | 73 | 73 | 73 | 73 | 73 | ... | 73 | 73 | 73 | 73 | 73 | 73 | 73 | 73 | 73 | 1.967125 |
| 35 | PBMC-healthy-3 | 5 | 151 | 151 | 151 | 151 | 151 | 151 | 151 | 151 | ... | 151 | 151 | 151 | 151 | 151 | 151 | 151 | 151 | 151 | 4.068984 |
| 36 | PBMC-healthy-4 | 0 | 4738 | 4738 | 4738 | 4738 | 4738 | 4738 | 4738 | 4738 | ... | 4738 | 4738 | 4738 | 4738 | 4738 | 4738 | 4738 | 4738 | 4738 | 41.146331 |
| 37 | PBMC-healthy-4 | 1 | 4234 | 4234 | 4234 | 4234 | 4234 | 4234 | 4234 | 4234 | ... | 4234 | 4234 | 4234 | 4234 | 4234 | 4234 | 4234 | 4234 | 4234 | 36.769431 |
| 38 | PBMC-healthy-4 | 2 | 1124 | 1124 | 1124 | 1124 | 1124 | 1124 | 1124 | 1124 | ... | 1124 | 1124 | 1124 | 1124 | 1124 | 1124 | 1124 | 1124 | 1124 | 9.761181 |
| 39 | PBMC-healthy-4 | 3 | 1067 | 1067 | 1067 | 1067 | 1067 | 1067 | 1067 | 1067 | ... | 1067 | 1067 | 1067 | 1067 | 1067 | 1067 | 1067 | 1067 | 1067 | 9.266175 |
| 40 | PBMC-healthy-4 | 4 | 330 | 330 | 330 | 330 | 330 | 330 | 330 | 330 | ... | 330 | 330 | 330 | 330 | 330 | 330 | 330 | 330 | 330 | 2.865827 |
| 41 | PBMC-healthy-4 | 5 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | ... | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 22 | 0.191055 |
42 rows × 31 columns
Plot of Percentage Total Counts vs leiden_0.05 of all samples
plt.figure(figsize = (10,3))
ax = sns.barplot(x = 'leiden_0.05', y = 'pernewpo5', hue = 'sample', data = countsp_05)
ax.set(xlabel='leiden_0.05',
ylabel='Percenatge_total_counts')
plt.legend(bbox_to_anchor=(1.4,0.8))
<matplotlib.legend.Legend at 0x7f7f3d0fd610>
#import scipy io package
from scipy import io
save_file = '/home/jana/Updated_SCANPY_QC_filtered_PBMC_for_Sarcoidosis.h5ad'
adata.write_h5ad(save_file)
#saving into CSV data
counts.to_csv('/home/jana/Merged_new_object_metadata.csv')
Delete all samples variables with merged object
del(adata, pbmcsarc1, pbmcsarc2, pbmcsarc3, pbmchealthy1, pbmchealthy2, pbmchealthy3, pbmchealthy4)